### ---- SCRIPT 01: The purpose of this script is to pre-process the Twitter/X user data extracted from ---- ###
### ---- https://doi.org/10.7910/DVN/JDB0SE. It normalises the original left-right ideal points onto a  ---- ###
### ---- 0-1 scale, categorises accounts as ordinary/elite, then generates a plot comparing the two     ---- ###
### ---- distributions. Finally, it joins guests with their ideal points and the ideal points of their  ---- ###
### ---- affiliated organisations.                                                                      ---- ###

# Heavily penalise scientific notation so numbers are printed in fixed notation
options(scipen = 999)

#### ---- LIBRARIES ---- ####

library(dplyr) # data wrangling
library(ggplot2) # data visualisation
library(data.table) # easier to work with large datasets

#### ---- LOAD DATA ---- ####

# Guest attributes (joined to the Twitter/X data by `username` further down)
username_list <- fread(input = "username_master_list.csv")

# Organisation attributes (joined to the Twitter/X data by `username` further down)
organisation_list <- fread(input = "organisation_master_list.csv")

# Twitter/X user data (redacted); holds the raw ideal points in `svd.phi_1`
twitter_user_data <- fread(input = "user_ideal_point_data.csv")

#### ---- PREPROCESSING ---- ####

# In the Twitter user data, normalise the ideal points onto a bounded range [0,1] using min-max normalisation
#
# Arguments:
#   data  Numeric vector. Missing values are ignored when finding the range and
#         remain missing in the result.
#
# Returns a numeric vector of the same length as `data`, where the smallest
# observed value maps to 0 and the largest to 1.
#
# Edge cases:
#   * No non-missing values (including length-zero input): returns all-NA
#     without emitting the min()/max() "no non-missing arguments" warnings.
#   * All observed values identical: the range is zero, so the usual formula
#     would produce 0/0 = NaN. A warning is raised and observed values are
#     mapped to the midpoint 0.5 instead.
min_max_norm <- function(data) {
  observed <- data[!is.na(data)]

  # Nothing to scale against
  if (length(observed) == 0) {
    return(rep(NA_real_, length(data)))
  }

  # Find the minimum and maximum values
  min_val <- min(observed)
  max_val <- max(observed)
  value_range <- max_val - min_val

  # isTRUE() keeps a NaN range (e.g. all values Inf) on the ordinary path below
  if (isTRUE(value_range == 0)) {
    warning("min_max_norm(): all non-missing values are identical; returning 0.5",
            call. = FALSE)
    # (data - min_val) is 0 for observed values and NA for missing ones
    return((data - min_val) + 0.5)
  }

  # Normalize the data to the range [0, 1]
  (data - min_val) / value_range
}

# Derive two columns on the Twitter user data in a single step:
#   user_ideal_point - `svd.phi_1` rescaled with min_max_norm()
#   elite_account    - "Elite Account" / "Ordinary Account". This is based on two conditions: an account
#                      is elite when it is verified and/or has at least 30,000 followers.
twitter_user_data <- twitter_user_data %>%
  mutate(
    user_ideal_point = min_max_norm(svd.phi_1),
    elite_account = ifelse(
      verified == "TRUE" | followers_count >= 30000,
      "Elite Account",
      "Ordinary Account"
    )
  )

# Plot distributions of elite and ordinary users: overlaid density curves and box plots of the
# normalised ideal point, filled by account type, with a dashed reference line at 0.5
user_data_ideal_compare <- twitter_user_data %>%
  ggplot(aes(x = user_ideal_point, fill = factor(elite_account))) +
  geom_density(alpha = 0.7, bw = "sj") +
  geom_boxplot(width = 0.5, show.legend = FALSE) +
  geom_vline(xintercept = 0.5, linetype = "dashed") +
  # Label the two ends of the axis as the poles of the scale
  scale_x_continuous(
    breaks = seq(from = 0, to = 1, by = 0.25),
    labels = c("Left", "0.25", "0.5", "0.75", "Right")
  ) +
  scale_fill_manual(values = c("grey30", "grey80")) +
  labs(
    x = "Ideal Point",
    fill = "Account Type"
  ) +
  # theme_minimal() must precede theme() so the tweaks below are not overwritten
  theme_minimal() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    legend.position = "top"
  )

# Write the figure to disk as a 7 x 4 inch, 300 dpi PNG on a white background
ggsave(
  filename = "ordinary_vs_user_ideal_comparison.png",
  plot = user_data_ideal_compare,
  width = 7,
  height = 4,
  units = "in",
  dpi = 300,
  bg = "white"
)

# Save the Twitter user data, which at this point also carries the derived columns
fwrite(twitter_user_data, file = "user_ideal_point_data_updated.csv")

# Merge the guest and organisation data to their respective Twitter user data via their usernames.
# For guests, clashing Twitter columns get a "_twitter" suffix while the guest columns keep their names.
username_list <- username_list %>%
  left_join(twitter_user_data, by = "username", suffix = c("", "_twitter"))

# For organisations, the matched account's ideal point becomes the organisation's own ideal point
organisation_list <- organisation_list %>%
  left_join(twitter_user_data, by = "username") %>%
  rename(organisation_ideal_point = user_ideal_point)

# Calculate an additional set of ideal points for organisations that uses the mean ideal points of all
# accounts affiliated with them. This is ascertained by including all elite accounts that @ mention the
# organisation in their Twitter bio.
elite_users <- twitter_user_data %>% filter(elite_account == "Elite Account")

# "@handle" form of each organisation's username; NA where the organisation has no username
organisation_list$username_match <- ifelse(organisation_list$username != "",
                                           paste0("@", organisation_list$username),
                                           NA)

# Return the elite accounts whose bio mentions the organisation held in row `org_row` of
# organisation_list, with an `organisation` column set to that organisation's name.
# Always returns a table with the columns of elite_users (possibly zero rows).
find_affiliated_accounts <- function(org_row) {
  org_handle <- organisation_list$username_match[[org_row]]
  org_name <- organisation_list$organisation[[org_row]]

  if (is.na(org_handle)) {
    # No handle to search for. This must be handled explicitly: pasting NA into a
    # pattern would produce the literal text "NA" and match unrelated bios.
    org_affiliates <- head(elite_users, 0)
  } else {
    # Twitter/X usernames consist of letters, digits and underscores. The negative
    # lookahead requires the mention to end where the handle ends, so that e.g.
    # "@bbc" is not counted as a mention when the bio actually says "@bbcnews".
    mention_pattern <- paste0(org_handle, "(?![A-Za-z0-9_])")
    org_affiliates <- elite_users %>%
      filter(grepl(mention_pattern, description, ignore.case = TRUE, perl = TRUE))
  }

  org_affiliates %>% mutate(organisation = org_name)
}

# Collect the per-organisation matches in a list and bind once, rather than growing a table with
# rbind() inside a loop. seq_len() makes a zero-row organisation list iterate zero times.
affiliated_accounts_df <- bind_rows(
  lapply(seq_len(nrow(organisation_list)), find_affiliated_accounts)
)

# Summarise the affiliated accounts per organisation: number of matched accounts, plus the mean,
# median and standard deviation of their normalised ideal points (missing ideal points are ignored).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
affiliated_accounts_summary <- affiliated_accounts_df %>%
  group_by(organisation) %>%
  summarise(
    count = n(),
    affiliate_mean_ideal_point = mean(user_ideal_point, na.rm = TRUE),
    affiliate_median_ideal_point = median(user_ideal_point, na.rm = TRUE),
    affiliate_ideal_point_sd = sd(user_ideal_point, na.rm = TRUE)
  )

# Merge the organisation affiliation summary data to the original organisation data
organisation_list <- left_join(organisation_list, affiliated_accounts_summary, by = "organisation")

# Finally, attach the organisation-level ideal points to the guest data, matching on the organisation
# each guest represents/is affiliated with. Each guest can then carry up to three ideal points: their
# own, their organisation's, and the mean across their organisation's affiliates.
organisation_ideal_points <- organisation_list %>%
  select(
    organisation,
    organisation_ideal_point,
    affiliate_mean_ideal_point
  )

username_list <- username_list %>%
  left_join(organisation_ideal_points, by = "organisation")

# Save the enriched guest list
fwrite(username_list, file = "username_master_list_with_ideal_points.csv")

#### ---- END ---- ####